library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
library(RColorBrewer)

# read transactions dataset and convert to arules sparse matrix format
fcsv <- "https://raw.githubusercontent.com/multidis/hult-retail-analytics/main/shopping_cart/transactions_binary.csv"
df_trans <- read_csv(fcsv)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
trans <- transactions(as.matrix(df_trans))
summary(trans)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##        labels
## 1 frankfurter
## 2     sausage
## 3  liver loaf
# rules exceeding support and confidence thresholds
rules <- apriori(trans, parameter = list(support=0.001, confidence=0.5))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [5668 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# top 5 rules in terms of lift
inspect(head(sort(rules, by ="lift"), 5))
##     lhs                                   rhs              support    
## [1] {Instant food products,soda}       => {hamburger meat} 0.001220132
## [2] {soda,popcorn}                     => {salty snack}    0.001220132
## [3] {flour,baking powder}              => {sugar}          0.001016777
## [4] {ham,processed cheese}             => {white bread}    0.001931876
## [5] {whole milk,Instant food products} => {hamburger meat} 0.001525165
##     confidence coverage    lift     count
## [1] 0.6315789  0.001931876 18.99565 12   
## [2] 0.6315789  0.001931876 16.69779 12   
## [3] 0.5555556  0.001830198 16.40807 10   
## [4] 0.6333333  0.003050330 15.04549 19   
## [5] 0.5000000  0.003050330 15.03823 15
# frequency plot
itemFrequencyPlot(trans, topN = 20)

# most promising rules by lift (confidence-support scatterplot)
rules_sel <- head(sort(rules, by="lift"), 50)
plot(rules_sel, engine="plotly", control=list(col=brewer.pal(11,"Spectral")), main="")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# network representation
plot(rules_sel, method = "graph",  engine = "htmlwidget")
# rules for a selected product
rules_selprod <- subset(rules, subset = rhs %pin% "pastry")
inspect(head(sort(rules_selprod, by="lift"), 10))
##     lhs                     rhs          support confidence    coverage lift count
## [1] {citrus fruit,                                                                
##      whole milk,                                                                  
##      whipped/sour cream,                                                          
##      rolls/buns}         => {pastry} 0.001016777        0.5 0.002033554 5.62    10